#!/bin/bash
#
# Copyright (c) 2012-2015 Huawei .
# All rights reserved.
#
#
# Function
# revise note
######################################
# Make sure shell command tracing is switched off.
set +x
# Absolute directory of this script; used to locate the profile sourced on the next line.
WORK_PATH=$(cd "$(dirname "$BASH_SOURCE")";pwd)
source ${WORK_PATH}/ebackup_ms_profile.sh

# MachineRole values that pm_status/pm_start compare against the value read
# from ${G_PUBLIC_CONF_FILE}.
G_BACKUP_SERVER=0
G_BACKUP_PROXY=1
G_BACKUPMANAGER=2
G_BACKUPWORKFLOW=3
# Self-assignment: the value is presumably provided by ebackup_ms_profile.sh -- TODO confirm.
G_PUBLIC_CONF_FILE="${G_PUBLIC_CONF_FILE}"
# G_HOME_PATH is not assigned in this file; presumably it comes from the sourced profile -- TODO confirm.
export HCP_ROOT=${G_HOME_PATH}
export LD_LIBRARY_PATH=$HCP_ROOT/libs:$LD_LIBRARY_PATH
export PATH=${HCP_ROOT}/sbin:/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/sbin:/root/bin:/usr/local/bin:$PATH

G_FLOAT_IP_BAK=${HCP_ROOT}/conf/floatIpBak
G_hcpconf_path="${HCP_ROOT}/conf/hcpconf.ini"
# Filled in by checkout_os_type ("SuSE" or "Euler").
G_OS_TYPE="" 
G_IFCONFIG="$(which ifconfig)"
hcp_script_logfile="/opt/huawei-data-protection/ebackup/logs/script.log"

G_MANAGE_FIP_PORT=""

# Monitor names iterated by pm_process_check (checked with pm_status, started with pm_start).
pm_name="HCPProcessMonitor OmmHa ebk_openstack_vm_monitor ebk_openstack_vmware_monitor ebk_mgr_monitor ebk_backup_monitor ebk_restore_monitor ebk_delete_monitor ebk_copy_monitor ebk_vmware_monitor ebk_accelerator_monitor ebk_fsbackup_monitor ebk_jobmanager_monitor"
# Process entries iterated by process_restart_check to detect restarts/stops and send alarms.
pm_alm_nm="/opt/huawei-data-protection/ebackup/db/bin/gaussdb /bin/AdminNode /bin/HCPProcessMonitor ./ibase ./dsware_agent /ha/module/hamon/bin/ha_monitor /ha/module/hacom/bin/ha.bin /bin/hcplogrotate.sh /microservice/ebk_governance"
# Entries for which check_wether_need_alarm suppresses the alarm when the HA role is
# "noHA" or the machine role is proxy/workflow.
pm_proxy_out="/opt/huawei-data-protection/ebackup/db/bin/gaussdb /bin/AdminNode ./ibase /ha/module/hamon/bin/ha_monitor /ha/module/hacom/bin/ha.bin /microservice/ebk_governance"
# Entries for which check_wether_need_alarm suppresses the alarm when the HA role is "standby".
pm_standby_out="/bin/AdminNode ./ibase /microservice/ebk_governance"


# Script whose output is stored in G_HA_ROLE by the main section (server/manager nodes only).
G_HA_GET_ROLE_SCRIPT="/opt/huawei-data-protection/ebackup/ha/module/hacom/script/get_harole.sh"
G_HA_ROLE=""
# Text after '=' on the MachineRole line(s) of hcpconf.ini.
G_MACHINE_ROLE=`cat /opt/huawei-data-protection/ebackup/conf/hcpconf.ini |grep MachineRole |awk -F "=" '{print $2}'`
G_MACHINE_ROLE_SERVER=0
G_MACHINE_ROLE_MANAGER=2
G_MACHINE_ROLE_PROXY=1
G_MACHINE_ROLE_WORKFLOW=3

G_HCP_PROCESS_ACCOUNT=hcpprocess
G_HA_NAT_LOST_TIME_FILE=$HCP_ROOT/conf/ha_nat_lost_time
G_HA_NAT_LOST_TIME=4


#errcode
EX_GENERAL_LIMIT_LISTEN_FAILED=31

# File of "<procname>,<pid>" records maintained by process_restart_check.
pm_pid_file="${HCP_ROOT}/tmp/process_alarm.pid"
# Temporary file checked (and removed once HA status looks complete) by process_restart_check.
ha_config_tmp_file="${HCP_ROOT}/tmp/config_ha"
ha_status_sh="${HCP_ROOT}/ha/module/hacom/script/status_ha.sh"
pm_log_file="${HCP_ROOT}/logs/script.log"
# While this file exists, float_ip_check and process_restart_check return early.
G_UPGRADE_FLAG_FILE="${HCP_ROOT}/tmp/upgrade_running"
# Path matched in the ps output by the main section to detect an already running check.
hcp_pm_crontab_chk_file=${HCP_ROOT}/bin/hcp_pm_crontab_chk.sh
#The tag of check the CPU usage and kill the high CPU process.Just kill one in one check
G_IS_KILL_HIGH_CPU_PROCESS=0
# NOTE(review): log_info/log_warn/log_error and the alarm helpers used below are not defined
# in this file; presumably they come from these two libraries -- confirm.
source ${HCP_ROOT}/sbin/log.sh
source ${HCP_ROOT}/bin/event_lib.sh

# Refuse to read the configuration through a symbolic link.
if [ -L "${G_hcpconf_path}" ];then
    echo "symbol link File."
    exit 1
fi

# NetworkType in hcpconf.ini selects the IPv6 or the IPv4 flavour of the
# firewall/ping tools. The file is passed to grep as a quoted argument so that
# an empty path fails fast instead of making the pipeline read stdin, and
# IP_FLAG is quoted in the test so that a missing or empty NetworkType falls
# through to the IPv4 defaults without a "[" syntax error.
IP_FLAG=$(grep "NetworkType" "${G_hcpconf_path}" | awk -F '=' '{print $2}')
if [ "${IP_FLAG}" = "ipv6" ];then
    G_IPTABLES="$(which ip6tables)"
    G_IPTABLES_SAVE="$(which ip6tables-save)"
    G_IPTABLES_RESTORE="$(which ip6tables-restore)"
    G_IPTABLES_FILE_PATH=/etc/sysconfig/ip6tables
    G_IPTABLES_FILE="ip6tables"
    G_PING="$(which ping6)"
    G_LOCALHOST="::1"
else
    G_IPTABLES="$(which iptables)"
    G_IPTABLES_SAVE="$(which iptables-save)"
    G_IPTABLES_RESTORE="$(which iptables-restore)"
    G_IPTABLES_FILE_PATH=/etc/sysconfig/iptables
    G_IPTABLES_FILE="iptables"
    G_PING="$(which ping)"
    G_LOCALHOST="127.0.0.1"
fi

#******************************************************************#
# Function: hcp_ctl
# Description: Run a command line in the background through /bin/sh with
#              HCP_ROOT, PATH and LD_LIBRARY_PATH passed in its environment
# Input Parameters:
#    $@: the command and its arguments; they are joined with single
#        spaces into one command line that /bin/sh interprets
# Return : exit status of /bin/sh (the command itself is backgrounded)
#******************************************************************#
function hcp_ctl() 
{
    # "$*" joins on the first character of IFS; pin it to a space.
    local IFS=' '

    log_info "hcp_ctl:Run command with HCP account"
    # "$*" (not "$@") keeps the whole command line in ONE word for "sh -c";
    # with "$@" every argument after the first became a positional parameter
    # of sh and was silently dropped from the command. The environment values
    # are quoted so that paths containing spaces survive.
    HCP_ROOT="${HCP_ROOT}" PATH="${PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" /bin/sh -c "$* &"
}

#******************************************************************#
# Function: float_ip_check
# Description: ping the float IP and report the result through
#              ProcessFloatIPConnAbnormal (0: unreachable, 1: reachable)
# Input Parameters: None
# Return :
#    0: the float IP was probed and the result reported
#    1: nothing checked (upgrade running, HA mode is single,
#       or no float IP available)
#******************************************************************#
float_ip_check()
{
    local ha_mode
    local float_ip
    local reachable=1

    if [ -f ${G_UPGRADE_FLAG_FILE} ]; then
        log_info "[pm_pid_monitor] upgrade is running, so quit."
        return 1
    fi

    ha_mode=$(GetHAMode)
    case "${ha_mode}" in
        single)
            log_info "[pm_pid_monitor] ha mode is single, so quit."
            return 1
            ;;
    esac

    float_ip=$(GetFloatIP)
    [ -n "${float_ip}" ] || return 1

    # Three echo requests; any ping failure counts as unreachable.
    if ! ${G_PING} ${float_ip} -c 3 >/dev/null 2>&1; then
        log_warn "floatIP ${float_ip} can not reached !"
        reachable=0
    fi
    ProcessFloatIPConnAbnormal ${reachable}

    return 0
}

#******************************************************************#
# Function: check_proc_status
# Description: inspect "ps -elf" for a process. A defunct (Z) entry
#              gets its parent signalled with HUP, a stopped (T)
#              entry is killed with -9 on the last attempt.
# Input Parameters:
#    $1: process path (extended regex matched against the ps line)
#    $2: process name
#    $3: exclude string (extended regex; matching ps lines are ignored)
# Return:
#    0: process is running
#    1: execute ps failed
#    2: not running
#    3: status is T and has been killed
#    4: status is Z and has been killed
#******************************************************************#
function check_proc_status()
{
    local L_PROC_PATH=$1
    local L_PROC_NAME=$2
    local L_EXCLUDE_STRING=$3

    local L_MAX_TRIES=3
    local L_PAUSE=1
    local L_TRY
    local L_SNAPSHOT
    local L_LINE
    local L_STATE
    local L_PID
    local L_PPID
    local L_KILL_RC
    #if the process status is Z, only show the top 15 byte, eg: ebk_backup_monitor --> [ebk_backup_moni] <defunct>
    local L_SHORT_NAME="${L_PROC_NAME:0:15}"

    # Pass 1: look for a defunct entry of the process.
    for (( L_TRY = 1; L_TRY <= L_MAX_TRIES; L_TRY++ ))
    do
        if ! L_SNAPSHOT=$(/bin/ps -elf 2>/dev/null); then
            log_error "Execute ps command error, process(dir:${L_PROC_PATH}, name:${L_PROC_NAME})."
            return 1
        fi

        L_LINE=$(echo "${L_SNAPSHOT}" | grep -E -v "${L_EXCLUDE_STRING}" | grep -E "\[${L_SHORT_NAME}\] <defunct>" | sed -n "1p" 2>/dev/null)
        [ -n "${L_LINE}" ] || break

        L_STATE=$(echo "${L_LINE}" | awk '{print $2}')
        [ "Z" == "${L_STATE}" ] || break

        #find status is Z
        L_PID=$(echo "${L_LINE}" | awk '{print $4}')
        L_PPID=$(echo "${L_LINE}" | awk '{print $5}')
        kill -HUP ${L_PPID}
        L_KILL_RC=$?
        if [ ${L_TRY} -ge ${L_MAX_TRIES} ]; then
            log_error "The process(dir:${L_PROC_PATH}, name:${L_PROC_NAME}, pid:${L_PID}, ppid:${L_PPID}) status is Z, kill -HUP ${L_PPID} result is ${L_KILL_RC}."
            return 4
        fi
        log_warn "The process(dir:${L_PROC_PATH}, name:${L_PROC_NAME}, pid:${L_PID}, ppid:${L_PPID}) status is Z, kill -HUP ${L_PPID} result is ${L_KILL_RC}, check times:${L_TRY}."
        sleep ${L_PAUSE}
    done

    # Pass 2: the process must be listed and must not stay in state T.
    for (( L_TRY = 1; L_TRY <= L_MAX_TRIES; L_TRY++ ))
    do
        if ! L_SNAPSHOT=$(/bin/ps -elf 2>/dev/null); then
            log_error "Execute ps command error, process(dir:${L_PROC_PATH}, name:${L_PROC_NAME})."
            return 1
        fi

        L_LINE=$(echo "${L_SNAPSHOT}" | grep -E -v "${L_EXCLUDE_STRING}" | grep -E "${L_PROC_PATH}" | sed -n "1p" 2>/dev/null)
        if [ -z "${L_LINE}" ]; then
            log_error "The process(dir:${L_PROC_PATH}, name:${L_PROC_NAME}) not found."
            return 2
        fi

        L_STATE=$(echo "${L_LINE}" | awk '{print $2}')
        [ "T" == "${L_STATE}" ] || break

        #find status is T
        L_PID=$(echo "${L_LINE}" | awk '{print $4}')
        if [ ${L_TRY} -ge ${L_MAX_TRIES} ]; then
            kill -9 ${L_PID}
            L_KILL_RC=$?
            log_error "The process(dir:${L_PROC_PATH}, name:${L_PROC_NAME}, pid:${L_PID}) status is T, kill -9 ${L_PID} result is ${L_KILL_RC}."
            return 3
        fi
        log_warn "The process(dir:${L_PROC_PATH}, name:${L_PROC_NAME}, pid:${L_PID}) status is T, check times:${L_TRY}."
        sleep ${L_PAUSE}
    done

    return 0
}


#******************************************************************#
# Function: handle_process_cpu_status
# Description: check the CPU usage of one microservice process; if it is
#              at least 150 in two samples taken 10 seconds apart, kill
#              the process with -9 (it is expected to be started again
#              by the periodic check). At most one process is killed
#              per script run (G_IS_KILL_HIGH_CPU_PROCESS).
# Input Parameters:
#    $1: service name (directory below ${HCP_ROOT}/microservice)
#    $2: process name
# Return : status of the last executed command; callers ignore it
#******************************************************************#
function handle_process_cpu_status()
{
    #Check the CPU usage of process
    if [ $# -ne 2 ];then
        log_info "The param check failed.Need 2 prams."
        # This used to be the typo "rerurn": the shell reported "command not
        # found" and carried on with missing arguments.
        return
    fi
    if [ "X${G_IS_KILL_HIGH_CPU_PROCESS}" == "X1" ];then
        log_info "The cpu check,one check can kill only one process."
        return
    fi

    local serviceName=$1
    local process_name=$2

    # Only the first matching PID is used: several PIDs would turn the CPU
    # value below into a multi-line string that cannot be stored in an integer.
    local L_PID
    L_PID=$(/bin/ps -elf | grep "${HCP_ROOT}/microservice/${serviceName}/bin ${process_name}" | grep -v "grep" | awk 'NR==1 {print $4}')
    local L_CPUUSAGE
    local -i L_CPU_INT
    if [ "X${L_PID}" != "X" ];then
        L_CPUUSAGE=$(/bin/ps -p ${L_PID} -o pcpu | egrep -v "CPU" | awk '{print $1}')
        L_CPU_INT=$(echo "${L_CPUUSAGE}" | awk '{print int($0)}')
        if [ ${L_CPU_INT} -ge 150 ];then
            # Sample again after a pause so that a short spike does not trigger the kill.
            sleep 10
            L_CPUUSAGE=$(/bin/ps -p ${L_PID} -o pcpu | egrep -v "CPU" | awk '{print $1}')
            L_CPU_INT=$(echo "${L_CPUUSAGE}" | awk '{print int($0)}')
            if [ ${L_CPU_INT} -ge 150 ];then
                log_info "The cpu of ${process_name} is greater than 150.Kill the process to recover normal."
                kill -9 $L_PID >/dev/null 2>&1
                G_IS_KILL_HIGH_CPU_PROCESS=1
                echo "kill ok"
                sleep 5
            fi
        fi
        echo "Test is ok"
    fi
}





#******************************************************************#
# Function: pm_status
# Description: check whether the process(es) behind a monitor name are
#              running; which checks apply depends on the MachineRole
#              read from ${G_PUBLIC_CONF_FILE}
# Input Parameters:
#    $1: monitor name (an entry of pm_name)
# Return :
#    0: running, check not applicable for this role, unknown name,
#       or check_proc_status could not execute ps (its status 1)
#    1: check_proc_status returned a status greater than 1
#******************************************************************#
pm_status()
{
    local process_name="$1"
    local L_ret
    local role
    local serviceName

    case "${process_name}" in
        HCPProcessMonitor)
            check_proc_status "${HCP_ROOT}/bin/${process_name}" "${process_name}" "(grep|sh)"
            L_ret=$?
            if (( L_ret > 1 )); then
                return 1
            fi
            ;;
        OmmHa)
            # Both the HA monitor and ha.bin have to be alive.
            check_proc_status "${HCP_ROOT}/ha/module/hamon/bin/ha_monitor" "ha_monitor" "(grep)"
            L_ret=$?
            if (( L_ret > 1 )); then
                return 1
            fi
            check_proc_status "${HCP_ROOT}/ha/module/hacom/bin/ha.bin" "ha.bin" "(grep)"
            L_ret=$?
            if (( L_ret > 1 )); then
                return 1
            fi
            ;;
        ebk_openstack_vm_monitor|ebk_openstack_vmware_monitor)
            # Only checked on manager and workflow nodes; the service
            # directory is the monitor name without its "_monitor" suffix.
            role=$(cat ${G_PUBLIC_CONF_FILE} | grep "MachineRole" | cut -d'=' -f2)
            if [[ "${role}" == "${G_BACKUPMANAGER}" || "${role}" == "${G_BACKUPWORKFLOW}" ]]; then
                check_proc_status "${HCP_ROOT}/microservice/${process_name%_monitor}/bin/${process_name}" "${process_name}" "(grep)"
                L_ret=$?
                if (( L_ret > 1 )); then
                    return 1
                fi
            fi
            ;;
        ebk_backup_monitor|ebk_restore_monitor|ebk_delete_monitor|ebk_copy_monitor|ebk_vmware_monitor|ebk_accelerator_monitor|ebk_mgr_monitor|ebk_wa_chains_monitor|ebk_wa_index_monitor|ebk_wa_defrag_monitor|ebk_fsbackup_monitor|ebk_jobmanager_monitor)
            # Checked on server and proxy nodes; ebk_mgr_monitor and
            # ebk_jobmanager_monitor are checked regardless of the role.
            role=$(cat ${G_PUBLIC_CONF_FILE} | grep "MachineRole" | cut -d'=' -f2)
            if [[ "${role}" == "${G_BACKUP_SERVER}" || "${role}" == "${G_BACKUP_PROXY}" || "${process_name}" == "ebk_mgr_monitor" || "${process_name}" == "ebk_jobmanager_monitor" ]]; then
                serviceName=${process_name%%_monitor*}
                handle_process_cpu_status ${serviceName} ${process_name}
                check_proc_status "${HCP_ROOT}/microservice/${serviceName}/bin ${process_name}" "${process_name}" "(grep)"
                L_ret=$?
                echo $L_ret
                if (( L_ret > 1 )); then
                    return 1
                fi
            fi
            ;;
    esac

    return 0
}

#******************************************************************#
# Function: pm_start
# Description: start the process(es) behind a monitor name unless a
#              running process already carries ${HCP_ROOT}/bin/<name>
#              in its command line, then report pm_status
# Input Parameters:
#    $1: monitor name (an entry of pm_name)
# Return :
#    0: already running according to /proc/*/cmdline
#    1: a microservice script or lock file is a symbolic link, or the
#       lock file could not be opened, locked or unlocked
#    otherwise the status of pm_status for the monitor name
#******************************************************************#
pm_start()
{
    local process_name="$1"
    local role
    local serviceName
    local L_ha_role
    local L_svc_dir

    if grep "${HCP_ROOT}/bin/${process_name}" /proc/[0-9]*/cmdline 1>/dev/null 2>&1; then
        return 0
    fi

    echo "starting $process_name"
    log_info "pm_start: start the hcp processmonitor process(${process_name})."
    case "${process_name}" in
        HCPProcessMonitor)
            hcp_ctl ${HCP_ROOT}/bin/${process_name} 1>/dev/null 2>&1
            sleep 1
            ;;
        OmmHa)
            # The HA monitor is started on server and manager nodes only,
            # and not while ${HCP_ROOT}/conf/HAAddNode exists.
            role=$(cat ${G_PUBLIC_CONF_FILE} | grep "MachineRole" | cut -d'=' -f2)
            for L_ha_role in "${G_BACKUP_SERVER}" "${G_BACKUPMANAGER}"
            do
                if [[ "$role" == "${L_ha_role}" && ! -f "${HCP_ROOT}/conf/HAAddNode" ]]; then
                    ${HCP_ROOT}/ha/module/hamon/script/start_ha_monitor.sh
                    sleep 1
                fi
            done
            ;;
        ebk_openstack_vm_monitor|ebk_openstack_vmware_monitor)
            # Manager and workflow nodes only; the service directory is
            # the monitor name without its "_monitor" suffix.
            role=$(cat ${G_PUBLIC_CONF_FILE} | grep "MachineRole" | cut -d'=' -f2)
            if [[ "${role}" == "${G_BACKUPMANAGER}" || "${role}" == "${G_BACKUPWORKFLOW}" ]]; then
                L_svc_dir=${process_name%_monitor}
                source ${HCP_ROOT}/microservice/${L_svc_dir}/script/ebackup_env.sh
                sh ${HCP_ROOT}/microservice/${L_svc_dir}/script/start.sh
                sleep 1
            fi
            ;;
        ebk_backup_monitor|ebk_restore_monitor|ebk_delete_monitor|ebk_copy_monitor|ebk_vmware_monitor|ebk_accelerator_monitor|ebk_mgr_monitor|ebk_wa_chains_monitor|ebk_wa_index_monitor|ebk_wa_defrag_monitor|ebk_fsbackup_monitor|ebk_jobmanager_monitor)
            # Server and proxy nodes; ebk_mgr_monitor and
            # ebk_jobmanager_monitor are started regardless of the role.
            role=$(cat ${G_PUBLIC_CONF_FILE} | grep "MachineRole" | cut -d'=' -f2)
            if [[ "${role}" == "${G_BACKUP_SERVER}" || "${role}" == "${G_BACKUP_PROXY}" || "${process_name}" == "ebk_mgr_monitor" || "${process_name}" == "ebk_jobmanager_monitor" ]]; then
                serviceName=${process_name%%_monitor*}

                # The environment script must not be a symbolic link.
                if ls -l "${HCP_ROOT}/microservice/${serviceName}/script/ebackup_env.sh" | grep "^l" 1>/dev/null 2>/dev/null; then
                    echo "${HCP_ROOT}/microservice/${serviceName}/script/ebackup_env.sh is link file."
                    return 1
                fi

                source ${HCP_ROOT}/microservice/${serviceName}/script/ebackup_env.sh

                # Neither must the lock file be one.
                if ls -l "${HCP_ROOT}/microservice/${serviceName}/conf/${serviceName}.lock" | grep "^l" 1>/dev/null 2>/dev/null; then
                    echo "${HCP_ROOT}/microservice/${serviceName}/conf/${serviceName}.lock is link file."
                    return 1
                fi

                # The start script runs under an exclusive, non-blocking
                # lock held on file descriptor 3.
                exec 3<>${HCP_ROOT}/microservice/${serviceName}/conf/${serviceName}.lock
                if [ $? -ne 0 ]; then
                    log_error "launch_process:open lock(${serviceName}.lock) failed"
                    return 1
                fi

                if ! flock -xn 3; then
                    log_error "launch_process:lock(${serviceName}.lock) failed"
                    return 1
                fi

                if ls -l "${HCP_ROOT}/microservice/${serviceName}/script/ebackup_start.sh" | grep "^l" 1>/dev/null 2>/dev/null; then
                    echo "${HCP_ROOT}/microservice/${serviceName}/script/ebackup_start.sh is link file."
                    return 1
                fi

                sh ${HCP_ROOT}/microservice/${serviceName}/script/ebackup_start.sh
                if ! flock -u 3; then
                    log_error "launch_process:unlock(${serviceName}.lock) failed"
                    return 1
                fi
                sleep 1
            fi
            ;;
    esac

    # The function result is whatever the status check reports.
    pm_status ${process_name}
}

#******************************************************************#
# Function: pm_process_check
# Description: walk through every monitor name in pm_name, print it,
#              and call pm_start for each one whose pm_status is not 0
# Input Parameters: None
# Return : always 0
#******************************************************************#
pm_process_check()
{
    for processmonitor in ${pm_name}
    do
        echo "${processmonitor}"
        # A failed status check triggers a start attempt; its result is ignored.
        pm_status ${processmonitor} || pm_start ${processmonitor}
    done

    return 0
}

#******************************************************************#
# Function: check_wether_need_alarm
# Description: decide whether an abnormal alarm is wanted for a process.
#              Entries of pm_proxy_out need none when the HA role is
#              "noHA" or the machine role is proxy/workflow; entries of
#              pm_standby_out need none when the HA role is "standby".
# Input Parameters:
#    $1: process entry (as listed in pm_alm_nm)
# Return :
#    0: alarm is needed
#    1: no alarm is needed
#******************************************************************#
check_wether_need_alarm()
{
    local processName=$1
    local L_no_alarm_list=""

    # Collect every list that applies to the current roles, then scan once.
    if [[ "$G_HA_ROLE" == "noHA" ]] || [[ "$G_MACHINE_ROLE" == "$G_MACHINE_ROLE_PROXY" ]] || [[ "$G_MACHINE_ROLE" == "$G_MACHINE_ROLE_WORKFLOW" ]]; then
        L_no_alarm_list="${pm_proxy_out}"
    fi
    if [[ "$G_HA_ROLE" == "standby" ]]; then
        L_no_alarm_list="${L_no_alarm_list} ${pm_standby_out}"
    fi

    for proc in ${L_no_alarm_list}; do
        [[ "$proc" != "$processName" ]] || return 1
    done

    return 0
}

#******************************************************************#
# Function: process_restart_check
# Description: for every entry of pm_alm_nm, look up the PID of the
#              matching process whose parent PID is 1 in "ps -ef",
#              compare it with the PID recorded in ${pm_pid_file}
#              ("<entry>,<pid>" per line) and send alarms through
#              SendProcessAbnormalAlarm when a process appeared,
#              restarted or disappeared. The pid file is updated to
#              match.
# Input Parameters: None
# Return :
#    0: all entries were processed
#    1: log file or HA config temp file is a symbolic link, an upgrade
#       is running, or the pid file does not exist yet and
#       hcp_status_check does not report "running"
#    (the script exits with 1 if the pid file is a symbolic link)
#******************************************************************#
process_restart_check()
{   
    # Never append to the log through a symbolic link.
    if [ -L "${pm_log_file}" ];then
        echo "symbol link File."
        return 1
    fi
    # During an upgrade the recorded PIDs are dropped and nothing is checked.
    if [ -f ${G_UPGRADE_FLAG_FILE} ]; then
        date >>${pm_log_file}
        echo "[pm_pid_monitor] upgrade is running, so quit." >>${pm_log_file} 
        rm -rf ${pm_pid_file}
        return 1
    fi
    if [ -L "${ha_config_tmp_file}" ];then
        echo "symbol link File."
        return 1
    fi
    ## remove temp file when add ha node ##
    if [ -f ${ha_config_tmp_file} ]; then
        # The HA status script is run three times; each variable is non-empty only
        # when the corresponding keyword shows up near the HAMode/HAActive line.
        ha_mode=`${ha_status_sh} 2>>${pm_log_file} | grep -w -A1 "HAMode" | grep -w "double"`
        ha_active=`${ha_status_sh} 2>>${pm_log_file} | grep -wA2  "HAActive" | grep -w "active"`
        ha_standby=`${ha_status_sh} 2>>${pm_log_file} | grep -wA2  "HAActive" | grep -w "standby"`

        date >>${pm_log_file}
        # Double mode with both an active and a standby entry: HA configuration is complete.
        if [ -n "${ha_mode}" ] && [ -n "${ha_active}" ] && [ -n "${ha_standby}" ]; then
            echo "[pm_pid_monitor] config ha finished, now rm -rf ${ha_config_tmp_file}" >>${pm_log_file}
            if [ -L "${ha_config_tmp_file}" ];then
                echo "symbol link File."
                return
            fi         
            rm -rf ${ha_config_tmp_file}
        else
            echo "[pm_pid_monitor] config ha is running...,${ha_mode}, ${ha_active}, ${ha_standby}" >>${pm_log_file}            
        fi
    fi

  
    # Without a pid file, only continue when the last output line of
    # hcp_status_check contains the word "running".
    if [ ! -f ${pm_pid_file} ]; then
        sudo ${HCP_ROOT}/bin/sudo_common_func.sh hcp_status_check 2>>${pm_log_file}| tail -1 | grep -w "running" >/dev/null
        if [ $? -ne 0 ]; then
             return 1    
        fi      
    fi

    local send_alarm_paramters=""

    for procname in ${pm_alm_nm}
    do
        local proc_pid=""
        local full_proc_name=""
        local alarm_proc_name=""
        local proc_count=0
        local old_proc_count=0    
        local old_proc_uniq_pid=0        

        # Fresh process snapshot for every entry.
        ps_res=`ps -ef 2>&1`
        
        log_info "Begin check process(${procname}) whether or not restart."
        
        # Derive the command string to look for in field 8 of "ps -ef" (field 9 for
        # the log rotate script, which runs under /bin/sh). Only processes whose
        # parent PID (field 3) is 1 are considered.
        if [ "/bin/hcplogrotate.sh" == "${procname}" ]; then
            full_proc_name="${HCP_ROOT}${procname}"
            proc_pid=`echo -e "${ps_res}" | awk '{if ($8 == "/bin/sh" && $9 == "'${full_proc_name}'" && $3 == 1) print $2}'`
            log_info "Check Process(${procname}) status, pid is ${proc_pid}"
        else
            if [ "./ibase" == "${procname}" ] || [ "./dsware_agent" == "${procname}" ]; then
                full_proc_name="${procname}"
            elif [[ "${procname}" =~ ^/microservice/ebk_.* ]];then
                full_proc_name="${HCP_ROOT}${procname}/bin"
            elif [[ "${procname}" = "/opt/huawei-data-protection/ebackup/db/bin/gaussdb" ]];then
                full_proc_name="${procname}"
            else
                full_proc_name="${HCP_ROOT}${procname}"
            fi

            proc_pid=`echo -e "${ps_res}" | awk '{if ($8 == "'${full_proc_name}'" && $3 == 1) print $2}'`
        fi

        # More than one matching process: log the snapshot and skip this entry.
        proc_count=`echo "${proc_pid}" | wc -l`
        if [ ${proc_count} -gt 1 ]; then
            date >>${pm_log_file}
            echo "[pm_pid_monitor] procname=${procname}, proc_count=${proc_count}" >>${pm_log_file}
            log_error "The process full name is ${full_proc_name}, proc_count=${proc_count}."
            echo -e "${ps_res}" >>${pm_log_file}            
            continue
        fi

        # A pid file that is a symbolic link aborts the whole script.
        ls -l "${pm_pid_file}" | grep "^l" 1>/dev/null 2>/dev/null
        if [ $? -eq 0 ]
        then
            echo "${pm_pid_file} is link file."
            exit 1
        fi

        # PID recorded for this entry by an earlier run (empty if none).
        old_pid=`cat ${pm_pid_file} 2>/dev/null | awk -F',' '{if ($1 == "'${procname}'") print $2}'`

        # Several records for the same entry: keep only the first recorded PID.
        old_proc_count=`echo "${old_pid}" | wc -l` 
        if [ ${old_proc_count} -gt 1 ]; then
            date >>${pm_log_file}
            echo "[pm_pid_monitor] procname=${procname}, old_proc_count=${old_proc_count}" >>${pm_log_file}
            echo -e "${old_pid}" >>${pm_log_file}                   
            
            log_info "The process full name is ${full_proc_name}, old_proc_count=${old_proc_count}."
            
            old_proc_uniq_pid=`echo "${old_pid}" | sed -n '1p'`             

            # Mark every line containing the entry as DELETED and drop it, then re-add one record.
            sed -i 's#'"${procname}"'#DELETED#;/DELETED/d' ${pm_pid_file} 
            echo "${procname},${old_proc_uniq_pid}" >>${pm_pid_file}
            echo -e "[pm_pid_monitor] procname=${procname}, pid file content `cat ${pm_pid_file}`" >>${pm_log_file}
            old_pid=${old_proc_uniq_pid}
            echo "[pm_pid_monitor] procname=${procname}, change pidfile old_pid=${old_pid}" >>${pm_log_file}
        fi
        
        log_info "Th process full path is ${full_proc_name}, pid=${proc_pid}, oldpid=${old_pid}."
        
        if [ -z "${old_pid}" ]; then
            # No record yet: when the process is running, record its PID and send
            # the alarm with first argument 1 (the "recover" alarm, per the log text below).
            if [ -n "${proc_pid}" ]; then 
                echo "${procname},${proc_pid}" >>${pm_pid_file}
                echo -e "[pm_pid_monitor] procname=${procname}, pid file content `cat ${pm_pid_file}`" >>${pm_log_file}
                alarm_proc_name=`echo "${procname}" | awk -F'/' '{print $NF}'`
                echo -e "[pm_pid_monitor] procname=${procname}, sending recover alarm, ps -ef res: ${ps_res}" >>${pm_log_file}
                SendProcessAbnormalAlarm 1 "${alarm_proc_name}" "${alarm_proc_name}"
                log_info "Th process(${full_proc_name}) has started, pid=${proc_pid}, oldpid=${old_pid}."
            fi
        else
            if [ -n "${proc_pid}" ]; then 
                # Running with a different PID than recorded: the process was restarted.
                if [ "${proc_pid}" != "${old_pid}" ]; then        
                   # A ha.bin restart while the HA config temp file exists only updates the record.
                   if [ "/ha/module/hacom/bin/ha.bin" == "${procname}" ] && [ -f ${ha_config_tmp_file} ]; then
                       sed -i "s#\(${procname}\),\(${old_pid}\)#\1,${proc_pid}#g" ${pm_pid_file}
                   else
                       ### combine alarm parameters ###
                       # Abnormal (0) immediately followed by recover (1), then update the record.
                       alarm_proc_name=`echo "${procname}" | awk -F'/' '{print $NF}'`
                       SendProcessAbnormalAlarm 0 "${alarm_proc_name}" "${alarm_proc_name}"
                       SendProcessAbnormalAlarm 1 "${alarm_proc_name}" "${alarm_proc_name}"
                       sed -i "s#\(${procname}\),\(${old_pid}\)#\1,${proc_pid}#g" ${pm_pid_file}
                       echo -e "[pm_pid_monitor] procname=${procname}, pid file content `cat ${pm_pid_file}`" >>${pm_log_file}
                       date >>${pm_log_file}
                       echo "[pm_pid_monitor] procname=${procname}, old_pid=${old_pid}, cur_pid=${proc_pid}, send_alarm=${alarm_proc_name}" >>${pm_log_file} 
                       echo -e "${ps_res}" >>${pm_log_file}
                       log_warn "Th process(${full_proc_name}) has restarted, pid=${proc_pid}, oldpid=${old_pid}."
                   fi
                fi
                log_info "The process(${full_proc_name}) is normal. pid=${proc_pid}, oldpid=${old_pid}."
            else
                ### combine alarm parameters ###
                # Recorded but no longer running.
                alarm_proc_name=`echo "${procname}" | awk -F'/' '{print $NF}'`
                log_info "Current ha role:${G_HA_ROLE}, machine role:${G_MACHINE_ROLE}, process name:${procname}."
                # Non-zero means this entry needs no alarm in the current HA/machine role:
                # send the recover alarm, drop the record and go on with the next entry.
                check_wether_need_alarm $procname
                if [[ $? -ne 0 ]]; then
                    echo -e "[pm_pid_monitor] procname=${procname}, no need to send alarm, sending recover alarm, ps -ef res: ${ps_res}" >>${pm_log_file}
                    SendProcessAbnormalAlarm 1 "${alarm_proc_name}" "${alarm_proc_name}"
                    sed -i 's#'"${procname},${old_pid}"'#DELETED#;/DELETED/d' ${pm_pid_file}
                    echo -e "[pm_pid_monitor] procname=${procname}, pid file content `cat ${pm_pid_file}`" >>${pm_log_file}
                    log_info "Resume process(${alarm_proc_name}) abort alarm, current ha role:${G_HA_ROLE}, machine role:${G_MACHINE_ROLE}, oldpid=${old_pid}."
                    continue
                fi
                # Otherwise send the abnormal alarm; the record is kept.
                echo -e "[pm_pid_monitor] procname=${procname}, sending abnormal alarm, ps -ef res: ${ps_res}" >>${pm_log_file}
                SendProcessAbnormalAlarm 0 "${alarm_proc_name}" "${alarm_proc_name}"
                date >>${pm_log_file}
                echo "[pm_pid_monitor] procname=${procname}, old_pid=${old_pid}, cur_pid=${proc_pid}, send_alarm=${alarm_proc_name}" >>${pm_log_file}
                log_error "Th process(${full_proc_name}) has stoped, pid=${proc_pid}, oldpid=${old_pid}."
            fi
        fi
    done

    return 0
}

#******************************************************************#
# Function: get_net_no
# Description: print the IPv4 network number of an address, i.e. the
#              bitwise AND of every address octet with the matching
#              mask octet (192.168.5.130 & 255.255.255.128 ->
#              192.168.5.128). Works for every mask, including masks
#              such as 255.0.0.0 for which nothing used to be printed.
# Input Parameters:
#    $1: IPv4 address in dotted decimal notation
#    $2: netmask; four numbers separated by any non-digit characters
# Return :
#    0: network number printed on stdout
#    1: address or mask is malformed, nothing printed
#******************************************************************#
get_net_no() 
{ 
    local IP_ADDR=$1
    local MASK_ADDR=$2
    local -a L_IP_OCTETS
    local -a L_MASK_OCTETS
    local -a L_NET_OCTETS
    local L_IDX

    IFS='.' read -r -a L_IP_OCTETS <<< "${IP_ADDR}"
    # Every non-digit acts as a separator in the mask.
    IFS=' ' read -r -a L_MASK_OCTETS <<< "${MASK_ADDR//[^0-9]/ }"

    if [ ${#L_IP_OCTETS[@]} -ne 4 ] || [ ${#L_MASK_OCTETS[@]} -ne 4 ]; then
        return 1
    fi

    for L_IDX in 0 1 2 3
    do
        if [[ ! "${L_IP_OCTETS[L_IDX]}" =~ ^[0-9]+$ ]]; then
            return 1
        fi
        # 10# forces base 10 so that an octet like "08" is not read as invalid octal.
        L_NET_OCTETS[L_IDX]=$(( 10#${L_IP_OCTETS[L_IDX]} & 10#${L_MASK_OCTETS[L_IDX]} ))
    done

    printf '%s.%s.%s.%s\n' "${L_NET_OCTETS[@]}"
    return 0
}

#******************************************************************#
# Function: processMonitor_mem_check
# Description: kill HCPProcessMonitor when its resident memory exceeds
#              MemSizeThreshold (configured in GiB in hcpconf.ini).
#              SIGTERM is sent first; if the process is still there
#              after three one-second checks, SIGKILL follows.
# Input Parameters: None
# Return :
#    0: process not running, below the threshold, or kill attempted
#    1: MemSizeThreshold is missing or not a number
#******************************************************************#
function processMonitor_mem_check()
{
    local L_CONFIG_NAME="MemSizeThreshold"
    local L_PROC_LINE
    local L_PROC_PID
    local L_EXACT_MEM
    local L_MAX_MEM_NUMBER
    local L_MAX_MEM
    local L_RETRY_TIME=0

    # PID and RSS are taken from the same ps snapshot (first match only) so
    # that they always belong to the same process.
    L_PROC_LINE=$(ps -eo pid,rss,command | grep -E "${HCP_ROOT}/bin/HCPProcessMonitor$" | grep -E -v '(grep|sh)' | awk 'NR==1 {print $1, $2}')
    L_PROC_PID=${L_PROC_LINE%% *}
    L_EXACT_MEM=${L_PROC_LINE##* }
    if [[ ! "${L_PROC_PID}" =~ ^[0-9]+$ ]] || [[ ! "${L_EXACT_MEM}" =~ ^[0-9]+$ ]]; then
        # HCPProcessMonitor is not running: nothing to measure.
        return 0
    fi

    L_MAX_MEM_NUMBER=$(grep "${L_CONFIG_NAME}" "${G_HOME_PATH}/conf/hcpconf.ini" | awk -F "=" 'NR==1 {print $2}' | tr -d '[:space:]')
    if [[ ! "${L_MAX_MEM_NUMBER}" =~ ^[0-9]+$ ]]; then
        log_warn "HCPProcessMonitor Mem threshold ${L_CONFIG_NAME} is not a number: ${L_MAX_MEM_NUMBER}"
        return 1
    fi

    #the max mem unit is G.
    # ps reports RSS in KiB, so the GiB threshold is converted to KiB
    # (it used to be converted to bytes, which made it 1024 times too large).
    L_MAX_MEM=$(( 10#${L_MAX_MEM_NUMBER} * 1024 * 1024 ))

    if [ ${L_EXACT_MEM} -gt ${L_MAX_MEM} ];then
        log_warn "HCPProcessMonitor Mem $L_EXACT_MEM is greater than $L_MAX_MEM, will be killed"
        kill -15 ${L_PROC_PID} >/dev/null 2>&1
        while [ ${L_RETRY_TIME} -lt 3 ]
        do
            sleep 1
            if [ -e "/proc/${L_PROC_PID}/cmdline" ];then
                L_RETRY_TIME=$(( L_RETRY_TIME + 1 ))
            else
                log_info "HCPProcessMonitor Mem is killed successfully"
                return 0
            fi
        done
        #if it is not been killed , kill -KILL
        kill -9 ${L_PROC_PID} >/dev/null 2>&1
        sleep 1
        if [ -e "/proc/${L_PROC_PID}/cmdline" ];then
            log_error "HCPProcessMonitor Mem is not killed successfully"
        else
            log_info "HCPProcessMonitor Mem is killed successfully"
        fi
    fi

    return 0
}


################## restore iptables ###################

#******************************************************************#
# Function: checkout_os_type
# Description: detect the distribution from its release file and store
#              it in G_OS_TYPE ("SuSE" or "Euler"; the Euler file wins
#              when both exist). G_EULER_VERSION becomes "2.2" when the
#              Euler release file has a line containing both "2.0"
#              and "SP2".
# Input Parameters: None
# Return : 0
#******************************************************************#
function checkout_os_type()
{
    if [ -f "/etc/SuSE-release" ]; then
        G_OS_TYPE="SuSE"
    fi

    # Nothing more to detect without the Euler release file.
    if [ ! -f "/etc/euleros-release" ]; then
        return 0
    fi

    G_OS_TYPE="Euler"
    if cat /etc/euleros-release | grep "2.0" | grep "SP2" 1>/dev/null 2>/dev/null; then
        G_EULER_VERSION="2.2"
    fi
}

#******************************************************************#
# Function: hostname_check
# Description: make sure the hosts file maps the local host name to an
#              IP address of the configured family (IP_FLAG). If no
#              such line exists, a loopback mapping is appended:
#              "::1" in IPv6 mode, "127.0.0.2" otherwise.
#              The IPv6 entry used to be written as "[::1]", which is
#              not a valid hosts address and never satisfied this very
#              check, so one more line was appended on every run.
# Input Parameters:
#    $1: hosts file to check and update (optional, default /etc/hosts)
# Return :
#    0: mapping exists or has been appended
#    1: the local host name could not be determined
#******************************************************************#
hostname_check()
{
    local l_hosts_file="${1:-/etc/hosts}"
    local l_local_hostname
    local l_candidates
    local l_line
    local l_mapping_ip
    local l_name
    local l_ip_regex
    local l_loopback_ip
    local -a l_fields

    l_local_hostname=$(hostname)
    if [ -z "${l_local_hostname}" ]; then
        log_warn "hostname_check: local host name is empty, ${l_hosts_file} not checked."
        return 1
    fi

    if [ "$IP_FLAG" == "ipv6" ];then
        l_ip_regex='^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:)))(%.+)?\s*$'
        l_loopback_ip="::1"
    else
        l_ip_regex='^\s*((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])$'
        l_loopback_ip="127.0.0.2"
    fi

    # Pre-select the lines that mention the host name (literal match); the
    # exact comparison against each name of the line happens below.
    l_candidates=$(grep -F -- "${l_local_hostname}" "${l_hosts_file}" 2>/dev/null)
    while IFS= read -r l_line
    do
        # Ignore comments, then split into address and names.
        l_line="${l_line%%#*}"
        read -r -a l_fields <<< "${l_line}"
        if [ ${#l_fields[@]} -lt 2 ]; then
            continue
        fi
        l_mapping_ip="${l_fields[0]}"
        if ! echo "${l_mapping_ip}" | grep -E "${l_ip_regex}" > /dev/null 2>&1; then
            continue
        fi
        for l_name in "${l_fields[@]:1}"
        do
            if [ "${l_name}" == "${l_local_hostname}" ]; then
                # already config the mapping between ip and hostname
                return 0
            fi
        done
    done <<< "${l_candidates}"

    # Not config the mapping between ip and hostname, add a loopback mapping to hosts
    echo "${l_loopback_ip}       ${l_local_hostname}" >> "${l_hosts_file}"
    return 0
}

# Hand the log rotate log to the HCP process account (-h: change a symbolic link
# itself rather than the file it points to).
chown -h ${G_HCP_PROCESS_ACCOUNT}:hcp /opt/huawei-data-protection/ebackup/logs/HCP_LogRotate.log
 
#The main function
#check hcp_pm_crontab_chk.sh status, quit if it is already running
# Counts the ps lines of the form "/bin/sh -c sh <hcp_pm_crontab_chk_file>".
# NOTE(review): one such line presumably belongs to the current cron invocation,
# which would explain why only a count above 1 means "already running" -- confirm.
pm_cronjob=`ps -ef | awk '{if ($8 == "/bin/sh" && $9 == "-c" && $10 == "sh" && $11 =="'${hcp_pm_crontab_chk_file}'") print $11}' | wc -l`
if [ ${pm_cronjob} -gt 1 ]; then
    log_info "hcp processmonitor is already running(pm_cronjob=${pm_cronjob}), so quit!"
    exit 0
fi

log_info "hcp processmonitor isn't running, begin to run it."
checkout_os_type
# Check every monitor in pm_name and start the ones that are not running.
pm_process_check
#get HA role
# Only server and manager nodes query the HA role; elsewhere G_HA_ROLE stays empty.
if [ "$G_MACHINE_ROLE" = "${G_MACHINE_ROLE_SERVER}" -o "$G_MACHINE_ROLE" = "${G_MACHINE_ROLE_MANAGER}" ]
then
    log_info "ebackup node is server or manager."
    G_HA_ROLE=`$G_HA_GET_ROLE_SCRIPT 2>>${pm_log_file}`
fi
# Restart/stop detection and alarms; uses G_HA_ROLE determined above.
process_restart_check
sudo ${HCP_ROOT}/bin/sudo_common_func.sh ssh_port_check
float_ip_check
processMonitor_mem_check
sudo ${HCP_ROOT}/bin/sudo_common_func.sh rotate_dbtool_log

sudo ${HCP_ROOT}/bin/sudo_restore_iptable.sh

sh ${HCP_ROOT}/bin/db_sync_monitor.sh check_progress

# Run the storage connection monitor unless a process with that script name is already listed by ps.
ps -ef | grep -v "grep" | grep "storage_conn_monitor.sh" >/dev/null 2>&1
if [ $? -ne 0 ];then
    sh ${HCP_ROOT}/bin/storage_conn_monitor.sh
fi
# Make sure the hosts file maps the local host name to an IP address.
hostname_check

